Importing Read Count Table

We first copy the final_project_featCounts_mouse_genes_stranded_after_comments_from_merv.txt (found in SCU at /home/nib4003/ANGSD_2021_hw/final_project/final_project_featCounts_mouse_genes_stranded_after_comments_from_merv.txt) file from the SCU to the R project folder we are using for final project files. We then renamed this file to final_project_featCounts. This allows us to read in the files easily and keep the data together.

Below we import the libraries we will need for our analyses, and also show the readcounts table.

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.3
## Warning: package 'tibble' was built under R version 4.0.4
## Warning: package 'tidyr' was built under R version 4.0.4
## Warning: package 'readr' was built under R version 4.0.3
## Warning: package 'dplyr' was built under R version 4.0.4
## Warning: package 'forcats' was built under R version 4.0.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(magrittr)
## Warning: package 'magrittr' was built under R version 4.0.3
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(DESeq2)
## Warning: package 'DESeq2' was built under R version 4.0.3
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 4.0.3
## Loading required package: stats4
## Loading required package: BiocGenerics
## Warning: package 'BiocGenerics' was built under R version 4.0.3
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## Warning: package 'IRanges' was built under R version 4.0.3
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:purrr':
## 
##     reduce
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: GenomicRanges
## Warning: package 'GenomicRanges' was built under R version 4.0.3
## Loading required package: GenomeInfoDb
## Warning: package 'GenomeInfoDb' was built under R version 4.0.4
## Loading required package: SummarizedExperiment
## Warning: package 'SummarizedExperiment' was built under R version 4.0.3
## Loading required package: MatrixGenerics
## Warning: package 'MatrixGenerics' was built under R version 4.0.3
## Loading required package: matrixStats
## Warning: package 'matrixStats' was built under R version 4.0.3
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Warning: package 'Biobase' was built under R version 4.0.3
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
library(hexbin)
## Warning: package 'hexbin' was built under R version 4.0.4
library(org.Sc.sgd.db)
## Loading required package: AnnotationDbi
## Warning: package 'AnnotationDbi' was built under R version 4.0.3
## 
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
## 
##     select
## 
library(EnhancedVolcano)
## Warning: package 'EnhancedVolcano' was built under R version 4.0.3
## Loading required package: ggrepel
## Warning: package 'ggrepel' was built under R version 4.0.3
## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2
library(patchwork)
## Warning: package 'patchwork' was built under R version 4.0.3
readcounts <- paste0("final_project_featCounts.txt") %>% read.table(., header=TRUE)

The sample names are a bit annoying since they contain the entire SAM file name. Therefore, we label theses with more useful identifiers corresponding to diabetic, heterozygous, and wild-type samples.

# Keep a back-up copy of the original names
orig_names <- names(readcounts) 
# Change names 
names(readcounts) <-c(names(readcounts)[1:6],paste("DIABETES",c(1:3), sep = "_"),paste("HETEROZYGOUS",c(1:3), sep = "_"),paste("WT",c(1:3),sep="_") )
str(readcounts)
## 'data.frame':    55487 obs. of  15 variables:
##  $ Geneid        : chr  "ENSMUSG00000102693.1" "ENSMUSG00000064842.1" "ENSMUSG00000051951.5" "ENSMUSG00000102851.1" ...
##  $ Chr           : chr  "chr1" "chr1" "chr1;chr1;chr1;chr1;chr1;chr1;chr1" "chr1" ...
##  $ Start         : chr  "3073253" "3102016" "3205901;3206523;3213439;3213609;3214482;3421702;3670552" "3252757" ...
##  $ End           : chr  "3074322" "3102125" "3207317;3207317;3215632;3216344;3216968;3421901;3671498" "3253236" ...
##  $ Strand        : chr  "+" "+" "-;-;-;-;-;-;-" "+" ...
##  $ Length        : int  1070 110 6094 480 2819 2233 2309 250 2057 926 ...
##  $ DIABETES_1    : int  0 0 163 0 25 14 7 0 21 0 ...
##  $ DIABETES_2    : int  0 0 187 0 32 14 8 0 16 0 ...
##  $ DIABETES_3    : int  0 0 169 0 12 11 12 0 17 0 ...
##  $ HETEROZYGOUS_1: int  0 0 230 0 45 19 13 0 19 0 ...
##  $ HETEROZYGOUS_2: int  0 0 198 0 38 12 14 0 13 0 ...
##  $ HETEROZYGOUS_3: int  0 0 232 0 47 16 12 0 19 0 ...
##  $ WT_1          : int  0 0 174 0 38 23 11 0 21 0 ...
##  $ WT_2          : int  0 0 146 0 39 14 11 2 13 0 ...
##  $ WT_3          : int  0 0 166 0 38 19 12 0 26 0 ...

CountData

In principle, readcounts is pretty much already in the format that we will need (columns = Samples, rows = genes), but we are missing row.names and the first couple of columns contain meta data information that need to be separated from the counts (e.g. gene Ids, gene lengths, etc.). We perform these operations below.

row.names(readcounts) <- make.names(readcounts$Geneid)
## exclude the columns without read counts (columns 1 to 6 contain additional
## info such as genomic coordinates)
readcounts <- readcounts[ ,-c(1:6)]

head(readcounts)
##                      DIABETES_1 DIABETES_2 DIABETES_3 HETEROZYGOUS_1
## ENSMUSG00000102693.1          0          0          0              0
## ENSMUSG00000064842.1          0          0          0              0
## ENSMUSG00000051951.5        163        187        169            230
## ENSMUSG00000102851.1          0          0          0              0
## ENSMUSG00000103377.1         25         32         12             45
## ENSMUSG00000104017.1         14         14         11             19
##                      HETEROZYGOUS_2 HETEROZYGOUS_3 WT_1 WT_2 WT_3
## ENSMUSG00000102693.1              0              0    0    0    0
## ENSMUSG00000064842.1              0              0    0    0    0
## ENSMUSG00000051951.5            198            232  174  146  166
## ENSMUSG00000102851.1              0              0    0    0    0
## ENSMUSG00000103377.1             38             47   38   39   38
## ENSMUSG00000104017.1             12             16   23   14   19

Below we create a dataframe for the gene information for each sample.

 #let's use the info from our readcounts objects
sample_info <- DataFrame(condition =gsub("_[0-9]+", "",names(readcounts)),row.names =names(readcounts) )

sample_info # rows contain gene information for each sample
## DataFrame with 9 rows and 1 column
##                   condition
##                 <character>
## DIABETES_1         DIABETES
## DIABETES_2         DIABETES
## DIABETES_3         DIABETES
## HETEROZYGOUS_1 HETEROZYGOUS
## HETEROZYGOUS_2 HETEROZYGOUS
## HETEROZYGOUS_3 HETEROZYGOUS
## WT_1                     WT
## WT_2                     WT
## WT_3                     WT

Generate the DESeqDataSet

Below we generate the DESeqDataSet from our counts for all the samples.

DESeq.ds <- DESeqDataSetFromMatrix(countData = as.matrix(readcounts), colData = sample_info, design = ~ condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
DESeq.ds
## class: DESeqDataSet 
## dim: 55487 9 
## metadata(1): version
## assays(1): counts
## rownames(55487): ENSMUSG00000102693.1 ENSMUSG00000064842.1 ...
##   ENSMUSG00000095019.1 ENSMUSG00000095041.7
## rowData names(0):
## colnames(9): DIABETES_1 DIABETES_2 ... WT_2 WT_3
## colData names(1): condition
head(counts(DESeq.ds))
##                      DIABETES_1 DIABETES_2 DIABETES_3 HETEROZYGOUS_1
## ENSMUSG00000102693.1          0          0          0              0
## ENSMUSG00000064842.1          0          0          0              0
## ENSMUSG00000051951.5        163        187        169            230
## ENSMUSG00000102851.1          0          0          0              0
## ENSMUSG00000103377.1         25         32         12             45
## ENSMUSG00000104017.1         14         14         11             19
##                      HETEROZYGOUS_2 HETEROZYGOUS_3 WT_1 WT_2 WT_3
## ENSMUSG00000102693.1              0              0    0    0    0
## ENSMUSG00000064842.1              0              0    0    0    0
## ENSMUSG00000051951.5            198            232  174  146  166
## ENSMUSG00000102851.1              0              0    0    0    0
## ENSMUSG00000103377.1             38             47   38   39   38
## ENSMUSG00000104017.1             12             16   23   14   19

Below we show the number of reads sequenced for each sample.

colSums(counts(DESeq.ds)) %>% barplot

Remove genes with no reads:

dim(DESeq.ds)
## [1] 55487     9
keep_genes <- rowSums(counts(DESeq.ds)) > 0
DESeq.ds <- DESeq.ds[ keep_genes, ]
dim(DESeq.ds)
## [1] 30372     9

Assumptions of DESeq’s size factor method:

Calculating and applying the size factor

DESeq.ds <- estimateSizeFactors(DESeq.ds)
# calculate SFs, add them to object
plot(sizeFactors(DESeq.ds),colSums(counts(DESeq.ds)), ylab = "library sizes", xlab = "size factors", cex = .6 )

Reducing the dependence of the variance on the mean

DESeq offers two ways to shrink the log-transformed counts for genes with very low counts: rlog and varianceStabilizingTransformation(vst).

We’ll use rlog here as it is an optimized method for RNA-seq read counts: it transforms the read counts to the log2 scale while simultaneously minimizing the difference between samples for rows with small counts and taking differences between library sizes of the samples into account. More specifically, the expression values are modeled in such a manner that their dispersion is not based on the actual variability an individual gene may show across its replicates, instead, its variability is based on the general dispersion-mean trend over the entire dataset.This is possible because DESeq2 assumes that ‘genes of similar average expression strength have similar dispersion"; this is important as it allows us to base our estimation of the noise on a much larger data set than the original number of replicates. For example, in our dataset, we only have 10 values per gene. That is not a lot of values to robustly estimate the noise; however, following DESeq2’s assumption, we can increase our number of data points significantly by assuming that all genes that share similar average expression values across all samples should also display similar noise levels. This assumption works very well on a global scale, but it also means that you cannot take the rlog values at face value when looking at single genes, because their values are a strong reflection of all of the values for genes in the same expression strength bin.The vst methods tends to depend a bit more on the differences in sequencing depths, but generally, both methods should return similar results.

# this actually generates a different type of object!
DESeq.rlog <-rlog(DESeq.ds, blind = TRUE)
## set blind = FALSE if the conditions are expected to introduce
## strong differences in a large proportion of the genes

Below we visually check the results of the rlog transformation for diabetes sample 1 and diabetes sample 2.

par(mfrow=c(1,2))


## the rlog-transformed counts are stored in the accessor "assay"
plot(assay(DESeq.rlog)[,1],assay(DESeq.rlog)[,2],cex=.1, main = "rlog transformed",xlab =colnames(assay(DESeq.rlog[,1])),ylab =colnames(assay(DESeq.rlog[,2])) )

Below we plot the first two principal components, which shows us that our samples are shown to have features which group them very well.

# We set ntop = 500 (the default) to show this represents the top 500 most variable genes used by the PCA
#jpeg(file="PCA_plot.jpeg")
plotPCA(DESeq.rlog, ntop = 500) 

#dev.off()

Using Just Wild Type vs. Heterozygous

Below we take the columns corresponding to the heterozygous and wild-type mice only for a direct comparison between these two sample types.

## Take only the last 6 columns (heterozygous and wild type)
readcounts_het_wt <- readcounts[ , c(4:9)]

head(readcounts_het_wt)
##                      HETEROZYGOUS_1 HETEROZYGOUS_2 HETEROZYGOUS_3 WT_1 WT_2
## ENSMUSG00000102693.1              0              0              0    0    0
## ENSMUSG00000064842.1              0              0              0    0    0
## ENSMUSG00000051951.5            230            198            232  174  146
## ENSMUSG00000102851.1              0              0              0    0    0
## ENSMUSG00000103377.1             45             38             47   38   39
## ENSMUSG00000104017.1             19             12             16   23   14
##                      WT_3
## ENSMUSG00000102693.1    0
## ENSMUSG00000064842.1    0
## ENSMUSG00000051951.5  166
## ENSMUSG00000102851.1    0
## ENSMUSG00000103377.1   38
## ENSMUSG00000104017.1   19

Below we create a dataframe for the gene information for each sample.

 #let's use the info from our readcounts objects
sample_info_het_wt <- DataFrame(condition_het_wt =gsub("_[0-9]+", "",names(readcounts_het_wt)),row.names =names(readcounts_het_wt) )

sample_info_het_wt # rows contain gene information for each sample
## DataFrame with 6 rows and 1 column
##                condition_het_wt
##                     <character>
## HETEROZYGOUS_1     HETEROZYGOUS
## HETEROZYGOUS_2     HETEROZYGOUS
## HETEROZYGOUS_3     HETEROZYGOUS
## WT_1                         WT
## WT_2                         WT
## WT_3                         WT

Generate the DESeqDataSet for heterozygous and wild type only

Below we generate the DESeqDataSet from our counts for the wild-type and heterozygous mice.

DESeq.ds_het_wt <- DESeqDataSetFromMatrix(countData = as.matrix(readcounts_het_wt), colData = sample_info_het_wt, design = ~ condition_het_wt)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
DESeq.ds_het_wt
## class: DESeqDataSet 
## dim: 55487 6 
## metadata(1): version
## assays(1): counts
## rownames(55487): ENSMUSG00000102693.1 ENSMUSG00000064842.1 ...
##   ENSMUSG00000095019.1 ENSMUSG00000095041.7
## rowData names(0):
## colnames(6): HETEROZYGOUS_1 HETEROZYGOUS_2 ... WT_2 WT_3
## colData names(1): condition_het_wt
head(counts(DESeq.ds_het_wt))
##                      HETEROZYGOUS_1 HETEROZYGOUS_2 HETEROZYGOUS_3 WT_1 WT_2
## ENSMUSG00000102693.1              0              0              0    0    0
## ENSMUSG00000064842.1              0              0              0    0    0
## ENSMUSG00000051951.5            230            198            232  174  146
## ENSMUSG00000102851.1              0              0              0    0    0
## ENSMUSG00000103377.1             45             38             47   38   39
## ENSMUSG00000104017.1             19             12             16   23   14
##                      WT_3
## ENSMUSG00000102693.1    0
## ENSMUSG00000064842.1    0
## ENSMUSG00000051951.5  166
## ENSMUSG00000102851.1    0
## ENSMUSG00000103377.1   38
## ENSMUSG00000104017.1   19

Below we show the number of reads sequenced for each sample.

colSums(counts(DESeq.ds_het_wt)) %>% barplot

Remove genes with no reads:

dim(DESeq.ds_het_wt)
## [1] 55487     6
keep_genes_het_wt <- rowSums(counts(DESeq.ds_het_wt)) > 0
DESeq.ds_het_wt <- DESeq.ds_het_wt[ keep_genes_het_wt, ]
dim(DESeq.ds_het_wt)
## [1] 28515     6

Below we change the names of the genes from the ensembl IDs to the external gene names. We do this using the getBM package from BiocManager and using the biomaRt library. We first make a character vector of the rownames, corresponding to the Ensemble geneIds. We then use the BioMart database to map the Ensemble geneIds to the common names of the genes for mice (mmusculus). Finally, we replace the rownames of the current readcounts with the updated names. The reference is https://www.biostars.org/p/147351/. The output for mapped_names is a vector of length less than that of the total number of genes in the readcounts matrix. This is due to some Ensembl names not mapping to an external gene name. To incorporate all the genes, we must merge the two dataframes such that the corresponding external gene name is NA for an Ensembl name initially without an external gene name. This allows us to compare our results with the paper.

#BiocManager::install("getBM")
library(biomaRt)
## Warning: package 'biomaRt' was built under R version 4.0.3
ens_het_wt <- c(rownames(DESeq.ds_het_wt))
ens_het_wt <- gsub('\\..*', '', ens_het_wt)
mouse = useMart(biomart="ensembl", dataset="mmusculus_gene_ensembl")
mapped_names <- getBM(attributes=c('ensembl_gene_id',
                      'external_gene_name'),
         values = ens_het_wt,
         mart = mouse)
ens_df_het_wt <- as.data.frame(ens_het_wt)
colnames(ens_df_het_wt) <- "ensembl_gene_id"
idmap_het_wt = merge(x = ens_df_het_wt, y = mapped_names, by="ensembl_gene_id", all.x=TRUE)
idmap_het_wt$external_gene_name <- ifelse(is.na(idmap_het_wt$external_gene_name), idmap_het_wt$ensembl_gene_id, idmap_het_wt$external_gene_name)
row.names(DESeq.ds_het_wt) <- make.names(idmap_het_wt[,2])

Calculating and applying the size factor for heterozygous and wild type only

Below we show the size factors of the samples.

DESeq.ds_het_wt <- estimateSizeFactors(DESeq.ds_het_wt)
# calculate SFs, add them to object
plot(sizeFactors(DESeq.ds_het_wt),colSums(counts(DESeq.ds_het_wt)), ylab = "library sizes", xlab = "size factors", cex = .6 )

Reducing the dependence of the variance on the mean

First, we use the rlog function which transforms the read counts to the log2 scale while simultaneously minimizing the difference between samples for rows with small counts and taking differences between library sizes of the samples into account. More specifically, the expression values are modeled in such a manner that their dispersion is not based on the actual variability an individual gene may show across its replicates, instead, its variability is based on the general dispersion-mean trend over the entire dataset.

# this actually generates a different type of object!
DESeq.rlog_het_wt <-rlog(DESeq.ds_het_wt, blind = TRUE)
## set blind = FALSE if the conditions are expected to introduce
## strong differences in a large proportion of the genes

Let’s visually check the results of the rlog transformation for heterozygous sample 1 and wild-type sample 1:

par(mfrow=c(1,2))


## the rlog-transformed counts are stored in the accessor "assay"
plot(assay(DESeq.rlog_het_wt)[,1],assay(DESeq.rlog_het_wt)[,2],cex=.1, main = "rlog transformed",xlab =colnames(assay(DESeq.rlog_het_wt[,1])),ylab =colnames(assay(DESeq.rlog_het_wt[,4])) )

Below we create an assay for the rlog-transform of the readcounts.

rlog.norm.counts_het_wt <-assay(DESeq.rlog_het_wt)

Below we show the effect of the rlog-transformed read counts.

## rlog-transformed read counts
msd_plot_het_wt <- vsn::meanSdPlot( rlog.norm.counts_het_wt, ranks=FALSE, plot = FALSE)
msd_plot_het_wt$gg+ ggtitle("rlog transformation") + coord_cartesian(ylim =c(0,3))

Below we relevel the data so that the reference is the wild-type samples.

DESeq.ds_het_wt
## class: DESeqDataSet 
## dim: 28515 6 
## metadata(1): version
## assays(1): counts
## rownames(28515): Gnai3 Cdc45 ... ENSMUSG00000118655 BX681418.1
## rowData names(0):
## colnames(6): HETEROZYGOUS_1 HETEROZYGOUS_2 ... WT_2 WT_3
## colData names(2): condition_het_wt sizeFactor
DESeq.ds_het_wt$condition_het_wt
## [1] HETEROZYGOUS HETEROZYGOUS HETEROZYGOUS WT           WT          
## [6] WT          
## Levels: HETEROZYGOUS WT
DESeq.ds_het_wt$condition_het_wt <- relevel(DESeq.ds_het_wt$condition_het_wt, ref = 'WT')

Now, let’s perform DE analysis!

DESeq.ds_het_wt <- DESeq(DESeq.ds_het_wt)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

Below we look at the DE in a new way. We can obtain unfiltered GLM results, i.e. without outlier removal or independent filtering with the following call which results in no p-values having an NA as a result:

DGE.results_het_wt <- results(DESeq.ds_het_wt)
dim(DGE.results_het_wt)
## [1] 28515     6
# Find how many significant genes. There are a large number of genes that have NA because the program knows that they have no chance at being significant and therefore does not even produce an output 
table(DGE.results_het_wt$padj < 0.05, useNA = "ifany")
## 
## FALSE  TRUE  <NA> 
## 11809   672 16034

The top 3 most significant genes and their corresponding adjusted p-values are shown below.

row.names(DGE.results_het_wt)[which(DGE.results_het_wt$padj<1e-40)]
## [1] "Hdac8"          "Gm42536"        "X2610306O10Rik"
DGE.results_het_wt$padj[which(DGE.results_het_wt$padj<1e-40)]
## [1]  0.000000e+00  5.599315e-94 2.678267e-281

Below we create a heatmap using complexheatmap for all significant genes.

#BiocManager::install("ComplexHeatmap")
library(ComplexHeatmap)
## Warning: package 'ComplexHeatmap' was built under R version 4.0.3
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.6.2
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##   genomic data. Bioinformatics 2016.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
library(circlize)
## Warning: package 'circlize' was built under R version 4.0.4
## ========================================
## circlize version 0.4.12
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
## 
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
##   in R. Bioinformatics 2014.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(circlize))
## ========================================
DGEgenes_het_wt <- subset(DGE.results_het_wt, padj < 0.05) %>% rownames
row.names(DESeq.rlog_het_wt) <- rownames(DGE.results_het_wt)
rlog.dge_het_wt <- DESeq.rlog_het_wt[DGEgenes_het_wt, ] %>% assay
z.mat <- t(scale(t(rlog.dge_het_wt), center=TRUE, scale=TRUE))
hcDat <- hclust(dist(z.mat))
cutGroups <- cutree(hcDat, h=4)
Heatmap(z.mat, name = "z-score",
        show_row_name = FALSE,
        cluster_columns = FALSE,
        split=cutGroups,
        column_title = "Heatmap For HET vs. WT Significant Genes")

MA-plots and volcano plots

The MA-plot provides a global view of the differential genes, with the log2 fold change on the y-axis over the mean of normalized counts. Genes that pass the significance threshold (adjusted p.value <0.05) are colored in blue.

plotMA(DGE.results_het_wt, alpha = 0.05, main = "Test: p.adj.value < 0.05 HET vs. WT")
abline(h = 1, col = 'red')
abline(h = -1, col = 'red')

Below we create a volcano plot showing the significant genes with respect to logFC change and p-value in red for the comparison between heterozygous and wild-type mice.

library(EnhancedVolcano)
vp1_het_wt <-EnhancedVolcano(DGE.results_het_wt,lab =rownames(DGE.results_het_wt),x ='log2FoldChange',y ='padj', pCutoff = 0.05, title = "HET vs WT", xlim=c(-5,5), ylim = c(0,20))
## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest non-
## zero p-value...
print(vp1_het_wt)

Shrinking logFC

Adjusts for types of noise we might see and tends to be more reliable and more robust. We also find the upregulated and downregulated genes based on the log2 Fold Change for the genes that we have already determined to be significant by their p-value. There are 27 upregulated genes and 8 genes in total significant for both p-value and having a log2 Fold Change showing a significant downregulation. These are shown in the table below.

# BiocManager::install('apeglm')
DGE.results.shrink_het_wt <- lfcShrink(DESeq.ds_het_wt, coef = 2, type = 'apeglm')
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
upregulated_genes_values_het_wt <- DGE.results.shrink_het_wt$log2FoldChange[DGE.results.shrink_het_wt$log2FoldChange > 1]
downregulated_genes_values_het_wt <- DGE.results.shrink_het_wt$log2FoldChange[DGE.results.shrink_het_wt$log2FoldChange < -1]

upregulated_genes_het_wt <- row.names(DGE.results.shrink_het_wt)[DGE.results.shrink_het_wt$log2FoldChange > 1]
upregulated_genes_values_het_wt_sorted <- upregulated_genes_values_het_wt[sort.list(upregulated_genes_values_het_wt)]
top_twentyfive_upregulated_genes_het_wt <- names(upregulated_genes_values_het_wt_sorted[1:25])

downregulated_genes_het_wt <- row.names(DGE.results.shrink_het_wt)[DGE.results.shrink_het_wt$log2FoldChange < -1]
downregulated_genes_values_het_wt_sorted <- downregulated_genes_values_het_wt[sort.list(downregulated_genes_values_het_wt)]
top_nine_downregulated_genes_het_wt <- names(downregulated_genes_values_het_wt_sorted[1:9])

finding_upregulated_genes_in_significant_genes <- upregulated_genes_het_wt %in% DGEgenes_het_wt
final_upregulated_genes_het_wt <- upregulated_genes_het_wt[finding_upregulated_genes_in_significant_genes]

finding_downregulated_genes_in_significant_genes <- downregulated_genes_het_wt %in% DGEgenes_het_wt
final_downregulated_genes_het_wt <- downregulated_genes_het_wt[finding_downregulated_genes_in_significant_genes]
print(final_upregulated_genes_het_wt)
##  [1] "Rnf114"             "Slc39a6"            "Slk"               
##  [4] "Arl8a"              "Ube2t"              "Auts2"             
##  [7] "Tmem63b"            "Cdh6"               "Cmklr1"            
## [10] "Pip5kl1"            "Zfp445"             "Trmt1l"            
## [13] "Ercc6"              "Gm9939"             "Gabra5"            
## [16] "Gm22739"            "Gm10570"            "Tldc2"             
## [19] "Fam83c"             "ENSMUSG00000077309" "ENSMUSG00000077829"
## [22] "Gm5210"             "Gm17771"            "Gm44163"           
## [25] "Gm44241"            "D9Wsu149"           "Gm49311"
print(final_downregulated_genes_het_wt)
## [1] "Brinp2"         "Gm24695"        "Hdac8"          "Gm37736"       
## [5] "Gm20239"        "Gm42536"        "Gm44041"        "X2610306O10Rik"
Significant Gene Name Upregulated or Downregulated
1 Rnf114 Upregulated
2 Slc39a6 Upregulated
3 Slk Upregulated
4 Arl8a Upregulated
5 Ube2t Upregulated
6 Auts2 Upregulated
7 Tmem63b Upregulated
8 Cdh6 Upregulated
9 Cmklr1 Upregulated
10 Pip5kl1 Upregulated
11 Zfp445 Upregulated
12 Trmt1l Upregulated
13 Ercc6 Upregulated
14 Gm9939 Upregulated
15 Gabra5 Upregulated
16 Gm22739 Upregulated
17 Gm10570 Upregulated
18 Tldc2 Upregulated
19 Fam83c Upregulated
20 ENSMUSG00000077309 (n-R5s8) Upregulated
21 ENSMUSG00000077829 (Gm23237) Upregulated
22 Gm5210 Upregulated
23 Gm17771 Upregulated
24 Gm44163 Upregulated
25 Gm44241 Upregulated
26 D9Wsu149 Upregulated
27 Gm49311 Upregulated
28 Brinp2 Downregulated
29 Gm24695 Downregulated
30 Hdac8 Downregulated
31 Gm37736 Downregulated
32 Gm20239 Downregulated
33 Gm42536 Downregulated
34 Gm44041 Downregulated
35 X2610306O10Rik Downregulated

Below we create a heatmap using complexheatmap for the top 25 upregulated genes. We see that heterozygous sample 3 is has a much greater contrast with the wild-type samples than the other two heterozgyous samples.

DGEgenes_het_wt_top_twentyfive_upregulated <- top_twentyfive_upregulated_genes_het_wt
rlog.dge_het_wt_top_twentyfive_upregulated <- DESeq.rlog_het_wt[DGEgenes_het_wt_top_twentyfive_upregulated, ] %>% assay
z.mat <- t(scale(t(rlog.dge_het_wt_top_twentyfive_upregulated), center=TRUE, scale=TRUE))
hcDat <- hclust(dist(z.mat))
cutGroups <- cutree(hcDat, h=4)
Heatmap(z.mat, name = "z-score",
        column_title = "HET vs. WT: Top 25 Upregulated Genes",
        column_title_gp = gpar(fontsize = 10),
        show_row_name = TRUE,
        cluster_columns = FALSE,
        split=cutGroups)

Below we do the same for the 9 downregulated genes we concluded to have a logFC value less than -1.

DGEgenes_het_wt_top_nine_downregulated <- top_nine_downregulated_genes_het_wt
rlog.dge_het_wt_top_nine_downregulated <- DESeq.rlog_het_wt[DGEgenes_het_wt_top_nine_downregulated, ] %>% assay
z.mat <- t(scale(t(rlog.dge_het_wt_top_nine_downregulated), center=TRUE, scale=TRUE))
hcDat <- hclust(dist(z.mat))
cutGroups <- cutree(hcDat, h=4)
Heatmap(z.mat, name = "z-score",
        column_title = "HET vs. WT: Top 9 Downregulated Genes",
        column_title_gp = gpar(fontsize = 10),
        show_row_name = TRUE,
        cluster_columns = FALSE,
        split=cutGroups)

Next we show the effect of the shrinkage on the MA plot:

plotMA(DGE.results.shrink_het_wt,alpha = 0.05, main = "with logFC shrinkage", ylim=c(-3,3))
abline(h = 1, col = 'red')
abline(h = -1, col = 'red')

par(mfrow =c(1,2))
plotMA(DGE.results_het_wt, alpha = 0.05,main = "no shrinkage")
DGE.results.shrink_het_wt <-lfcShrink(DESeq.ds_het_wt, coef = 2, type = "apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
abline(h = 1, col = 'red')
abline(h = -1, col = 'red')
plotMA(DGE.results.shrink_het_wt,alpha = 0.05, main = "with logFC shrinkage", ylim=c(-3,3))
abline(h = 1, col = 'red')
abline(h = -1, col = 'red')

Below we show a volcano plot after the logFC shrink operation. We notice that there are genes with p-values that are astronomically small. Therefore, we have reason to believe that there could have been a problem at some point along the pipeline. We also compare the gene expression differences between the heterozygous and wild-type mice with and without the logFC shrink. We see that with the shrinkage, we find a few more significant genes.

vp2_het_wt <- EnhancedVolcano(DGE.results.shrink_het_wt,lab =rownames(DGE.results.shrink_het_wt),x ='log2FoldChange',y ='padj', pCutoff = 0.05,title = "HET vs WT: with logFC shrinkage")
## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest non-
## zero p-value...
vp2_het_wt_with_cutoff <- EnhancedVolcano(DGE.results.shrink_het_wt,lab =rownames(DGE.results.shrink_het_wt),x ='log2FoldChange',y ='padj', pCutoff = 0.05,title = "HET vs WT: with logFC shrinkage", xlim=c(-5,5), ylim = c(0,20))
## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest non-
## zero p-value...
library(patchwork)
print(vp2_het_wt)

print(vp2_het_wt_with_cutoff)

vp1_het_wt+vp2_het_wt_with_cutoff

Using Just Diabetes vs. Heterozygous

Below we take the columns corresponding to the diabetic and heterozygous mice only for a direct comparison between these two sample types.

## Take only the first 6 columns (heterozygous and wild type)
readcounts_het_db <- readcounts[ , c(1:6)]

head(readcounts_het_db)
##                      DIABETES_1 DIABETES_2 DIABETES_3 HETEROZYGOUS_1
## ENSMUSG00000102693.1          0          0          0              0
## ENSMUSG00000064842.1          0          0          0              0
## ENSMUSG00000051951.5        163        187        169            230
## ENSMUSG00000102851.1          0          0          0              0
## ENSMUSG00000103377.1         25         32         12             45
## ENSMUSG00000104017.1         14         14         11             19
##                      HETEROZYGOUS_2 HETEROZYGOUS_3
## ENSMUSG00000102693.1              0              0
## ENSMUSG00000064842.1              0              0
## ENSMUSG00000051951.5            198            232
## ENSMUSG00000102851.1              0              0
## ENSMUSG00000103377.1             38             47
## ENSMUSG00000104017.1             12             16

Below we create a dataframe for the gene information for each sample.

 #let's use the info from our readcounts objects
sample_info_het_db <- DataFrame(condition_het_db =gsub("_[0-9]+", "",names(readcounts_het_db)),row.names =names(readcounts_het_db) )

sample_info_het_db # rows contain gene information for each sample
## DataFrame with 6 rows and 1 column
##                condition_het_db
##                     <character>
## DIABETES_1             DIABETES
## DIABETES_2             DIABETES
## DIABETES_3             DIABETES
## HETEROZYGOUS_1     HETEROZYGOUS
## HETEROZYGOUS_2     HETEROZYGOUS
## HETEROZYGOUS_3     HETEROZYGOUS

Generate the DESeqDataSet for heterozygous and diabetes only

Below we generate the DESeqDataSet from our counts for the heterozygous and diabetic mice.

DESeq.ds_het_db <- DESeqDataSetFromMatrix(countData = as.matrix(readcounts_het_db), colData = sample_info_het_db, design = ~ condition_het_db)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
DESeq.ds_het_db
## class: DESeqDataSet 
## dim: 55487 6 
## metadata(1): version
## assays(1): counts
## rownames(55487): ENSMUSG00000102693.1 ENSMUSG00000064842.1 ...
##   ENSMUSG00000095019.1 ENSMUSG00000095041.7
## rowData names(0):
## colnames(6): DIABETES_1 DIABETES_2 ... HETEROZYGOUS_2 HETEROZYGOUS_3
## colData names(1): condition_het_db
head(counts(DESeq.ds_het_db))
##                      DIABETES_1 DIABETES_2 DIABETES_3 HETEROZYGOUS_1
## ENSMUSG00000102693.1          0          0          0              0
## ENSMUSG00000064842.1          0          0          0              0
## ENSMUSG00000051951.5        163        187        169            230
## ENSMUSG00000102851.1          0          0          0              0
## ENSMUSG00000103377.1         25         32         12             45
## ENSMUSG00000104017.1         14         14         11             19
##                      HETEROZYGOUS_2 HETEROZYGOUS_3
## ENSMUSG00000102693.1              0              0
## ENSMUSG00000064842.1              0              0
## ENSMUSG00000051951.5            198            232
## ENSMUSG00000102851.1              0              0
## ENSMUSG00000103377.1             38             47
## ENSMUSG00000104017.1             12             16

Below we show the number of reads sequenced for each sample.

colSums(counts(DESeq.ds_het_db)) %>% barplot

Remove genes with no reads:

dim(DESeq.ds_het_db)
## [1] 55487     6
keep_genes_het_db <- rowSums(counts(DESeq.ds_het_db)) > 0
DESeq.ds_het_db <- DESeq.ds_het_db[ keep_genes_het_db, ]
dim(DESeq.ds_het_db)
## [1] 29035     6

Below we change the names of the genes from the ensembl IDs to the external gene names. We do this using the getBM package from BiocManager and using the biomaRt library. We first make a character vector of the rownames, corresponding to the Ensemble geneIds. We then use the BioMart database to map the Ensemble geneIds to the common names of the genes for mice (mmusculus). Finally, we replace the rownames of the current readcounts with the updated names. The reference is https://www.biostars.org/p/147351/. The output for mapped_names is a vector of length less than that of the total number of genes in the readcounts matrix. This is due to some Ensembl names not mapping to an external gene name. To incorporate all the genes, we must merge the two dataframes such that the corresponding external gene name is NA for an Ensembl name initially without an external gene name. This allows us to compare our results with the paper.

#BiocManager::install("getBM")
library(biomaRt)

ens_het_db <- c(rownames(DESeq.ds_het_db))
ens_het_db <- gsub('\\..*', '', ens_het_db)
mouse = useMart(biomart="ensembl", dataset="mmusculus_gene_ensembl")
mapped_names <- getBM(attributes=c('ensembl_gene_id',
                      'external_gene_name'),
         values = ens_het_db,
         mart = mouse)
ens_df_het_db <- as.data.frame(ens_het_db)
colnames(ens_df_het_db) <- "ensembl_gene_id"
idmap_het_db = merge(x = ens_df_het_db, y = mapped_names, by="ensembl_gene_id", all.x=TRUE)
idmap_het_db$external_gene_name <- ifelse(is.na(idmap_het_db$external_gene_name), idmap_het_db$ensembl_gene_id, idmap_het_db$external_gene_name)
row.names(DESeq.ds_het_db) <- make.names(idmap_het_db[,2])

Calculating and applying the size factor for heterozygous and wild type only

Below we show the size factors of the samples.

DESeq.ds_het_db <- estimateSizeFactors(DESeq.ds_het_db)
# calculate SFs, add them to object
plot(sizeFactors(DESeq.ds_het_db),colSums(counts(DESeq.ds_het_db)), ylab = "library sizes", xlab = "size factors", cex = .6 )

Reducing the dependence of the variance on the mean

First, we use the rlog function which transforms the read counts to the log2 scale while simultaneously minimizing the difference between samples for rows with small counts and taking differences between library sizes of the samples into account. More specifically, the expression values are modeled in such a manner that their dispersion is not based on the actual variability an individual gene may show across its replicates, instead, its variability is based on the general dispersion-mean trend over the entire dataset.

# this actually generates a different type of object!
DESeq.rlog_het_db <-rlog(DESeq.ds_het_db, blind = TRUE)
## set blind = FALSE if the conditions are expected to introduce
## strong differences in a large proportion of the genes

Next we visually check the results of the rlog transformation for diabetes sample 1 and heterozygous sample 1:

par(mfrow=c(1,2))


## the rlog-transformed counts are stored in the accessor "assay"
plot(assay(DESeq.rlog_het_db)[,1],assay(DESeq.rlog_het_db)[,2],cex=.1, main = "rlog transformed",xlab =colnames(assay(DESeq.rlog_het_db[,1])),ylab =colnames(assay(DESeq.rlog_het_db[,4])) )

Below we create an assay for the rlog-transform of the readcounts.

rlog.norm.counts_het_db <-assay(DESeq.rlog_het_db)

Below we show the effect of the rlog-transformed read counts.

## rlog-transformed read counts
msd_plot_het_db <- vsn::meanSdPlot( rlog.norm.counts_het_db, ranks=FALSE, plot = FALSE)
msd_plot_het_db$gg+ ggtitle("rlog transformation") + coord_cartesian(ylim =c(0,3))

Below we relevel the data so that the reference is the heterozygous samples.

DESeq.ds_het_db
## class: DESeqDataSet 
## dim: 29035 6 
## metadata(1): version
## assays(1): counts
## rownames(29035): Gnai3 Cdc45 ... ENSMUSG00000118655 BX681418.1
## rowData names(0):
## colnames(6): DIABETES_1 DIABETES_2 ... HETEROZYGOUS_2 HETEROZYGOUS_3
## colData names(2): condition_het_db sizeFactor
DESeq.ds_het_db$condition_het_db
## [1] DIABETES     DIABETES     DIABETES     HETEROZYGOUS HETEROZYGOUS
## [6] HETEROZYGOUS
## Levels: DIABETES HETEROZYGOUS
DESeq.ds_het_db$condition_het_db <- relevel(DESeq.ds_het_db$condition_het_db, ref = 'HETEROZYGOUS')

Now, let’s perform DE analysis!

DESeq.ds_het_db <- DESeq(DESeq.ds_het_db)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

Below we show the number of significant genes, non-significant genes, and genes that had no chance of being significant so they did not even have an adjusted p-value calculated for them (represented in the NA column).

DGE.results_het_db <- results(DESeq.ds_het_db)
dim(DGE.results_het_db)
## [1] 29035     6
# Find how many significant genes. There are ~% of genes that show significant DE 
table(DGE.results_het_db$padj < 0.05, useNA = "ifany")
## 
## FALSE  TRUE  <NA> 
## 14033  5993  9009

The top 4 most significant genes and their corresponding adjusted p-values are shown below.

row.names(DGE.results_het_db)[which(DGE.results_het_db$padj<1e-210)]
## [1] "Dixdc1"  "Als2cl"  "Gm13052" "Gm6789"
DGE.results_het_db$padj[which(DGE.results_het_db$padj<1e-210)]
## [1] 2.526889e-247  0.000000e+00 1.371387e-215 2.872114e-279

Below we create a heatmap using complexheatmap for the comparison between diabetic and heterozygous mice.

DGEgenes_het_db <- subset(DGE.results_het_db, padj < 0.05) %>% rownames
row.names(DESeq.rlog_het_db) <- rownames(DGE.results_het_db)
rlog.dge_het_db <- DESeq.rlog_het_db[DGEgenes_het_db, ] %>% assay
z.mat <- t(scale(t(rlog.dge_het_db), center=TRUE, scale=TRUE))
z.mat <- t(scale(t(rlog.dge_het_wt), center=TRUE, scale=TRUE))
hcDat <- hclust(dist(z.mat))
cutGroups <- cutree(hcDat, h=4)
Heatmap(z.mat, name = "z-score",
        show_row_name = FALSE,
        cluster_columns = FALSE,
        split=cutGroups,
        column_title = "Heatmap For DB vs. HET Significant Genes")

MA-plots and volcano plots

The MA-plot provides a global view of the differential genes, with the log2 fold change on the y-axis over the mean of normalized counts. Genes that pass the significance threshold (adjusted p.value <0.05) are colored in blue.

plotMA(DGE.results_het_db, alpha = 0.05, main = "Test: p.adj.value < 0.05 DB vs. HET", ylim =c(-4,4))
abline(h = 1, col = 'red')
abline(h = -1, col = 'red')

Below we create a volcano plot to show the most significant gene differences in expression between the heterozygous and diabetes mice.

library(EnhancedVolcano)
vp1_het_db <-EnhancedVolcano(DGE.results_het_db,lab =rownames(DGE.results_het_db),x ='log2FoldChange',y ='padj', pCutoff = 0.05, title = "DB vs HET")
## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest non-
## zero p-value...
print(vp1_het_db)

Shrinking logFC

Adjusts for types of noise we might see. Tend to be more reliable and more robust. We also find the upregulated and downregulated genes based on the log2 Fold Change for the genes that we have already determined to be significant by their p-value. The total upregulated genes are 1492 and total downregulated genes are 503.

# BiocManager::install('apeglm')
DGE.results.shrink_het_db <- lfcShrink(DESeq.ds_het_db, coef = 2, type = 'apeglm')
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
upregulated_genes_values_het_db <- DGE.results.shrink_het_db$log2FoldChange[DGE.results.shrink_het_db$log2FoldChange > 1]
downregulated_genes_values_het_db <- DGE.results.shrink_het_db$log2FoldChange[DGE.results.shrink_het_db$log2FoldChange < -1]

upregulated_genes_het_db <- row.names(DGE.results.shrink_het_db)[DGE.results.shrink_het_db$log2FoldChange > 1]
upregulated_genes_values_het_db_sorted <- upregulated_genes_values_het_db[sort.list(upregulated_genes_values_het_db)]
top_fifty_upregulated_genes_het_db <- names(upregulated_genes_values_het_db_sorted[1:50])

downregulated_genes_het_db <- row.names(DGE.results.shrink_het_db)[DGE.results.shrink_het_db$log2FoldChange < -1]
downregulated_genes_values_het_db_sorted <- downregulated_genes_values_het_db[sort.list(downregulated_genes_values_het_db)]
top_fifty_downregulated_genes_het_db <- names(downregulated_genes_values_het_db_sorted[1:50])

finding_upregulated_genes_in_significant_genes_het_db <- upregulated_genes_het_db %in% DGEgenes_het_db

final_upregulated_genes_het_db <- upregulated_genes_het_db[finding_upregulated_genes_in_significant_genes_het_db]

finding_downregulated_genes_in_significant_genes_het_db <- downregulated_genes_het_db %in% DGEgenes_het_db
final_downregulated_genes_het_db <- downregulated_genes_het_db[finding_downregulated_genes_in_significant_genes_het_db]
print(length(final_upregulated_genes_het_db))
## [1] 1492
print(length(final_downregulated_genes_het_db))
## [1] 503

Below we create a heatmap using complexheatmap for the top 50 upregulated genes. We see that diabetes sample 2 is more like the heterozygous samples than the other two diabetes samples. This makes sense because it is what we saw from the PCA as well, as diabetes sample 2 was in between the heterozygous and diabetes samples, but closer to the other diabetes samples.

DGEgenes_het_db_top_fifty_upregulated <- top_fifty_upregulated_genes_het_db
rlog.dge_het_db_top_fifty_upregulated <- DESeq.rlog_het_db[DGEgenes_het_db_top_fifty_upregulated, ] %>% assay
z.mat <- t(scale(t(rlog.dge_het_db_top_fifty_upregulated), center=TRUE, scale=TRUE))
hcDat <- hclust(dist(z.mat))
cutGroups <- cutree(hcDat, h=4)
Heatmap(z.mat, name = "z-score",
        column_title = "DB vs. HET: Top 50 Upregulated Genes",
        column_title_gp = gpar(fontsize = 10),
        show_row_name = TRUE,
        cluster_columns = FALSE,
        split=cutGroups)

Below we do the same for the top fifty downregulated genes.

DGEgenes_het_db_top_fifty_downregulated <- top_fifty_downregulated_genes_het_db
rlog.dge_het_db_top_fifty_downregulated <- DESeq.rlog_het_db[DGEgenes_het_db_top_fifty_downregulated, ] %>% assay
z.mat <- t(scale(t(rlog.dge_het_db_top_fifty_downregulated), center=TRUE, scale=TRUE))
hcDat <- hclust(dist(z.mat))
cutGroups <- cutree(hcDat, h=4)
Heatmap(z.mat, name = "z-score",
        column_title = "DB vs. HET: Top 50 Downregulated Genes",
        column_title_gp = gpar(fontsize = 30),
        show_row_name = TRUE,
        cluster_columns = FALSE,
        split=cutGroups)

Below we show the effect of the shrinkage on the MA plot.

plotMA(DGE.results.shrink_het_db,alpha = 0.05, main = "with logFC shrinkage", ylim =c(-4,4))
abline(h = 1, col = 'red')
abline(h = -1, col = 'red')

par(mfrow =c(1,2))
plotMA(DGE.results_het_db, alpha = 0.05,main = "no shrinkage", ylim =c(-4,4))
abline(h = 1, col = 'red')
abline(h = -1, col = 'red')
DGE.results.shrink_het_db <-lfcShrink(DESeq.ds_het_db, coef = 2, type = "apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
plotMA(DGE.results.shrink_het_db,alpha = 0.05, main = "with logFC shrinkage", ylim =c(-4,4))
abline(h = 1, col = 'red')
abline(h = -1, col = 'red')

Below we show a volcano plot after the logFC shrink operation. We also compare the gene expression differences between the diabetic and heterozygous mice with and without the logFC shrink. We see that with the shrinkage, we find a few more significant genes.

vp2_het_db <-EnhancedVolcano(DGE.results.shrink_het_db,lab =rownames(DGE.results.shrink_het_db),x ='log2FoldChange',y ='padj', pCutoff = 0.05,title = "DB vs HET: with logFC shrinkage")
## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest non-
## zero p-value...
library(patchwork)
print(vp2_het_db)

vp1_het_db+vp2_het_db

Using Just Diabetes vs. Wild Type

Below we take the columns corresponding to the diabetic and wild-type mice only for a direct comparison between these two sample types.

## Take only the first 3 and last 3 columns (diabetic and wild type)
readcounts_db_wt <- readcounts[ , c(1:3, 7:9)]

head(readcounts_db_wt)
##                      DIABETES_1 DIABETES_2 DIABETES_3 WT_1 WT_2 WT_3
## ENSMUSG00000102693.1          0          0          0    0    0    0
## ENSMUSG00000064842.1          0          0          0    0    0    0
## ENSMUSG00000051951.5        163        187        169  174  146  166
## ENSMUSG00000102851.1          0          0          0    0    0    0
## ENSMUSG00000103377.1         25         32         12   38   39   38
## ENSMUSG00000104017.1         14         14         11   23   14   19

Below we create a dataframe for the gene information for each sample.

 #let's use the info from our readcounts objects
sample_info_db_wt <- DataFrame(condition_db_wt =gsub("_[0-9]+", "",names(readcounts_db_wt)),row.names =names(readcounts_db_wt) )

sample_info_db_wt # rows contain gene information for each sample
## DataFrame with 6 rows and 1 column
##            condition_db_wt
##                <character>
## DIABETES_1        DIABETES
## DIABETES_2        DIABETES
## DIABETES_3        DIABETES
## WT_1                    WT
## WT_2                    WT
## WT_3                    WT

Generate the DESeqDataSet for wild-type and diabetes only

Below we generate the DESeqDataSet from our counts for the wild-type and diabetic mice.

DESeq.ds_db_wt <- DESeqDataSetFromMatrix(countData = as.matrix(readcounts_db_wt), colData = sample_info_db_wt, design = ~ condition_db_wt)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
DESeq.ds_db_wt
## class: DESeqDataSet 
## dim: 55487 6 
## metadata(1): version
## assays(1): counts
## rownames(55487): ENSMUSG00000102693.1 ENSMUSG00000064842.1 ...
##   ENSMUSG00000095019.1 ENSMUSG00000095041.7
## rowData names(0):
## colnames(6): DIABETES_1 DIABETES_2 ... WT_2 WT_3
## colData names(1): condition_db_wt
head(counts(DESeq.ds_db_wt))
##                      DIABETES_1 DIABETES_2 DIABETES_3 WT_1 WT_2 WT_3
## ENSMUSG00000102693.1          0          0          0    0    0    0
## ENSMUSG00000064842.1          0          0          0    0    0    0
## ENSMUSG00000051951.5        163        187        169  174  146  166
## ENSMUSG00000102851.1          0          0          0    0    0    0
## ENSMUSG00000103377.1         25         32         12   38   39   38
## ENSMUSG00000104017.1         14         14         11   23   14   19

Below we show the number of reads sequenced for each sample.

colSums(counts(DESeq.ds_db_wt)) %>% barplot

Remove genes with no reads:

dim(DESeq.ds_db_wt)
## [1] 55487     6
keep_genes_db_wt <- rowSums(counts(DESeq.ds_db_wt)) > 0
DESeq.ds_db_wt <- DESeq.ds_db_wt[ keep_genes_db_wt, ]
dim(DESeq.ds_db_wt)
## [1] 28940     6

Below we change the names of the genes from the ensembl IDs to the external gene names. We do this using the getBM package from BiocManager and using the biomaRt library. We first make a character vector of the rownames, corresponding to the Ensemble geneIds. We then use the BioMart database to map the Ensemble geneIds to the common names of the genes for mice (mmusculus). Finally, we replace the rownames of the current readcounts with the updated names. The reference is https://www.biostars.org/p/147351/. The output for mapped_names is a vector of length less than that of the total number of genes in the readcounts matrix. This is due to some Ensembl names not mapping to an external gene name. To incorporate all the genes, we must merge the two dataframes such that the corresponding external gene name is NA for an Ensembl name initially without an external gene name. This allows us to compare our results with the paper.

ens_db_wt <- c(rownames(DESeq.ds_db_wt))
ens_db_wt <- gsub('\\..*', '', ens_db_wt)
mouse = useMart(biomart="ensembl", dataset="mmusculus_gene_ensembl")
mapped_names <- getBM(attributes=c('ensembl_gene_id',
                      'external_gene_name'),
         values = ens_db_wt,
         mart = mouse)
ens_df_db_wt <- as.data.frame(ens_db_wt)
colnames(ens_df_db_wt) <- "ensembl_gene_id"
idmap_db_wt = merge(x = ens_df_db_wt, y = mapped_names, by="ensembl_gene_id", all.x=TRUE)
idmap_db_wt$external_gene_name <- ifelse(is.na(idmap_db_wt$external_gene_name), idmap_db_wt$ensembl_gene_id, idmap_db_wt$external_gene_name)
row.names(DESeq.ds_db_wt) <- make.names(idmap_db_wt[,2])

Calculating and applying the size factor for diabetic and wild type only

Below we show the size factors of the samples.

DESeq.ds_db_wt <- estimateSizeFactors(DESeq.ds_db_wt)
# calculate SFs, add them to object
plot(sizeFactors(DESeq.ds_db_wt),colSums(counts(DESeq.ds_db_wt)), ylab = "library sizes", xlab = "size factors", cex = .6 )

Reducing the dependence of the variance on the mean

First, we use the rlog function which transforms the read counts to the log2 scale while simultaneously minimizing the difference between samples for rows with small counts and taking differences between library sizes of the samples into account. More specifically, the expression values are modeled in such a manner that their dispersion is not based on the actual variability an individual gene may show across its replicates, instead, its variability is based on the general dispersion-mean trend over the entire dataset.

# this actually generates a different type of object!
DESeq.rlog_db_wt <-rlog(DESeq.ds_db_wt, blind = TRUE)
## set blind = FALSE if the conditions are expected to introduce
## strong differences in a large proportion of the genes

Let’s visually check the results of the rlog transformation for the first diabetes sample and first wild-type sample:

par(mfrow=c(1,2))


## the rlog-transformed counts are stored in the accessor "assay"
plot(assay(DESeq.rlog_db_wt)[,1],assay(DESeq.rlog_db_wt)[,2],cex=.1, main = "rlog transformed",xlab =colnames(assay(DESeq.rlog_db_wt[,1])),ylab =colnames(assay(DESeq.rlog_db_wt[,4])) )

Below we create an assay for the rlog-transform of the readcounts.

rlog.norm.counts_db_wt <-assay(DESeq.rlog_db_wt)

Below we show the effect of the rlog-transformed read counts.

## rlog-transformed read counts
msd_plot_db_wt <- vsn::meanSdPlot( rlog.norm.counts_db_wt, ranks=FALSE, plot = FALSE)
msd_plot_db_wt$gg+ ggtitle("rlog transformation") + coord_cartesian(ylim =c(0,3))

Below we relevel the data so that the reference is the wild-type samples.

DESeq.ds_db_wt
## class: DESeqDataSet 
## dim: 28940 6 
## metadata(1): version
## assays(1): counts
## rownames(28940): Gnai3 Cdc45 ... Gm17315 AC159819.1
## rowData names(0):
## colnames(6): DIABETES_1 DIABETES_2 ... WT_2 WT_3
## colData names(2): condition_db_wt sizeFactor
DESeq.ds_db_wt$condition_db_wt
## [1] DIABETES DIABETES DIABETES WT       WT       WT      
## Levels: DIABETES WT
DESeq.ds_db_wt$condition_db_wt <- relevel(DESeq.ds_db_wt$condition_db_wt, ref = 'WT')

Now, let’s perform DE analysis!

DESeq.ds_db_wt <- DESeq(DESeq.ds_db_wt)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

Below we look at the DE in a new way. We can obtain unfiltered GLM results, i.e. without outlier removal or independent filtering with the following call which results in no p-values having an NA as a result:

DGE.results_db_wt <- results(DESeq.ds_db_wt)
dim(DGE.results_db_wt)
## [1] 28940     6
# Find how many significant genes. There are a large number of genes that have NA because the program knows that they have no chance at being significant and therefore does not even produce an output 
table(DGE.results_db_wt$padj < 0.05, useNA = "ifany")
## 
## FALSE  TRUE  <NA> 
## 13450  5948  9542

The top 3 most significant genes and their corresponding adjusted p-values are shown below.

row.names(DGE.results_db_wt)[which(DGE.results_db_wt$padj<1e-210)]
## [1] "Ctrl"    "Frmd8os" "Gm50287"
DGE.results_db_wt$padj[which(DGE.results_db_wt$padj<1e-210)]
## [1] 9.069196e-223  0.000000e+00  0.000000e+00

Below we create a heatmap using complexheatmap for all significant genes.

DGEgenes_db_wt <- subset(DGE.results_db_wt, padj < 0.05) %>% rownames
row.names(DESeq.rlog_db_wt) <- rownames(DGE.results_db_wt)
rlog.dge_db_wt <- DESeq.rlog_db_wt[DGEgenes_db_wt, ] %>% assay
z.mat <- t(scale(t(rlog.dge_db_wt), center=TRUE, scale=TRUE))
hcDat <- hclust(dist(z.mat))
cutGroups <- cutree(hcDat, h=4)
Heatmap(z.mat, name = "z-score",
        show_row_name = FALSE,
        cluster_columns = FALSE,
        split=cutGroups,
        column_title = "Heatmap For DB vs. WT Significant Genes")

MA-plots and volcano plots

The MA-plot provides a global view of the differential genes, with the log2 fold change on the y-axis over the mean of normalized counts. Genes that pass the significance threshold (adjusted p.value <0.05) are colored in blue.

plotMA(DGE.results_db_wt, alpha = 0.05, main = "Test: p.adj.value < 0.05 DB vs. WT", ylim =c(-4,4))
abline(h = 1, col = 'red')
abline(h = -1, col = 'red')

Below we create a volcano plot showing the significant genes with respect to logFC change and p-value in red for the comparison between diabetic and wild-type mice.

library(EnhancedVolcano)
vp1_db_wt <-EnhancedVolcano(DGE.results_db_wt,lab =rownames(DGE.results_db_wt),x ='log2FoldChange',y ='padj', pCutoff = 0.05, title = "DB vs WT")
## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest non-
## zero p-value...
print(vp1_db_wt)

Shrinking logFC

Adjusts for types of noise we might see and tend to be more reliable and more robust. We also find the upregulated and downregulated genes based on the log2 Fold Change for the genes that we have already determined to be significant by their p-value. We find a total of 1619 upregulated genes and 473 downregulated genes.

# BiocManager::install('apeglm')
DGE.results.shrink_db_wt <- lfcShrink(DESeq.ds_db_wt, coef = 2, type = 'apeglm')
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
upregulated_genes_values_db_wt <- DGE.results.shrink_db_wt$log2FoldChange[DGE.results.shrink_db_wt$log2FoldChange > 1]
upregulated_genes_values_db_wt_sorted <- upregulated_genes_values_db_wt[sort.list(upregulated_genes_values_db_wt)]
top_fifty_upregulated_genes_db_wt <- names(upregulated_genes_values_db_wt_sorted[1:50])

downregulated_genes_values_db_wt <- DGE.results.shrink_db_wt$log2FoldChange[DGE.results.shrink_db_wt$log2FoldChange < -1]
downregulated_genes_values_db_wt_sorted <- downregulated_genes_values_db_wt[sort.list(downregulated_genes_values_db_wt)]
top_fifty_downregulated_genes_db_wt <- names(downregulated_genes_values_db_wt_sorted[1:50])

upregulated_genes_db_wt <- row.names(DGE.results.shrink_db_wt)[DGE.results.shrink_db_wt$log2FoldChange > 1]
downregulated_genes_db_wt <- row.names(DGE.results.shrink_db_wt)[DGE.results.shrink_db_wt$log2FoldChange < -1]

finding_upregulated_genes_in_significant_genes_db_wt <- upregulated_genes_db_wt %in% DGEgenes_db_wt
final_upregulated_genes_db_wt <- upregulated_genes_db_wt[finding_upregulated_genes_in_significant_genes_db_wt]

finding_downregulated_genes_in_significant_genes_db_wt <- downregulated_genes_db_wt %in% DGEgenes_db_wt
final_downregulated_genes_db_wt <- downregulated_genes_db_wt[finding_downregulated_genes_in_significant_genes_db_wt]
print(length(final_upregulated_genes_db_wt))
## [1] 1619
print(length(final_downregulated_genes_db_wt))
## [1] 473

Below we create a heatmap using complexheatmap for the top 50 upregulated genes. We see that diabetes sample 2 is more like the wild type samples than the other two diabetes samples. This makes sense because it is what we saw from the PCA as well, as diabetes sample 2 was in between the wild type and diabetes samples, but closer to the other diabetes samples.

DGEgenes_db_wt_top_fifty_upregulated <- top_fifty_upregulated_genes_db_wt
rlog.dge_db_wt_top_fifty_upregulated <- DESeq.rlog_db_wt[DGEgenes_db_wt_top_fifty_upregulated, ] %>% assay
z.mat <- t(scale(t(rlog.dge_db_wt_top_fifty_upregulated), center=TRUE, scale=TRUE))
hcDat <- hclust(dist(z.mat))
cutGroups <- cutree(hcDat, h=4)
Heatmap(z.mat, name = "z-score",
        column_title = "DB vs. WT: Top 50 Upregulated Genes",
        column_title_gp = gpar(fontsize = 10),
        show_row_name = TRUE,
        cluster_columns = FALSE,
        split=cutGroups)

Below we create a heatmap using complexheatmap for the top 50 downregulated genes. We see that diabetes sample 2 is more like the wild type samples than the other two diabetes samples. This makes sense because it is what we saw from the PCA as well, as diabetes sample 2 was in between the wild type and diabetes samples, but closer to the other diabetes samples.

DGEgenes_db_wt_top_fifty_downregulated <- top_fifty_downregulated_genes_db_wt
rlog.dge_db_wt_top_fifty_downregulated <- DESeq.rlog_db_wt[DGEgenes_db_wt_top_fifty_downregulated, ] %>% assay
z.mat <- t(scale(t(rlog.dge_db_wt_top_fifty_downregulated), center=TRUE, scale=TRUE))
hcDat <- hclust(dist(z.mat))
cutGroups <- cutree(hcDat, h=4)
Heatmap(z.mat, name = "z-score",
        column_title = "DB vs. WT: Top 50 Downregulated Genes",
        column_title_gp = gpar(fontsize = 10),
        show_row_name = TRUE,
        cluster_columns = FALSE,
        split=cutGroups)

Next we show the effect of the shrinkage on the MA plot:

plotMA(DGE.results.shrink_db_wt,alpha = 0.05, main = "DB vs WT: with logFC shrinkage", ylim =c(-3,3))
abline(h = 1, col = 'red')
abline(h = -1, col = 'red')

par(mfrow =c(1,2))
plotMA(DGE.results_db_wt, alpha = 0.05,main = "no shrinkage", ylim =c(-4,4))
abline(h = 1, col = 'red')
abline(h = -1, col = 'red')
DGE.results.shrink_db_wt <-lfcShrink(DESeq.ds_db_wt, coef = 2, type = "apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
plotMA(DGE.results.shrink_db_wt,alpha = 0.05, main = "with logFC shrinkage", ylim =c(-4,4))
abline(h = 1, col = 'red')
abline(h = -1, col = 'red')

Below we show a volcano plot after the logFC shrink operation. We also compare the gene expression differences between the diabetic and wild-type mice with and without the logFC shrink. We see that with the shrinkage, we find a few more significant genes.

vp2_db_wt <-EnhancedVolcano(DGE.results.shrink_db_wt,lab =rownames(DGE.results.shrink_db_wt),x ='log2FoldChange',y ='padj', pCutoff = 0.05,title = "DB vs WT: with logFC shrinkage")
## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest non-
## zero p-value...
library(patchwork)
print(vp2_db_wt)

vp1_db_wt+vp2_db_wt

GO Term and Pathway Enrichments

Because GO analysis requires us to use the Ensembl gene names, we have to redo everything without using Biomart to convert the gene names to the external gene names. We just need to repeat the same steps as before without changing the gene names. Below we generate the DESeqDataSet from our counts for the wild-type and diabetic mice.

DESeq.ds_db_wt_go <- DESeqDataSetFromMatrix(countData = as.matrix(readcounts_db_wt), colData = sample_info_db_wt, design = ~ condition_db_wt)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
DESeq.ds_db_wt_go
## class: DESeqDataSet 
## dim: 55487 6 
## metadata(1): version
## assays(1): counts
## rownames(55487): ENSMUSG00000102693.1 ENSMUSG00000064842.1 ...
##   ENSMUSG00000095019.1 ENSMUSG00000095041.7
## rowData names(0):
## colnames(6): DIABETES_1 DIABETES_2 ... WT_2 WT_3
## colData names(1): condition_db_wt
head(counts(DESeq.ds_db_wt_go))
##                      DIABETES_1 DIABETES_2 DIABETES_3 WT_1 WT_2 WT_3
## ENSMUSG00000102693.1          0          0          0    0    0    0
## ENSMUSG00000064842.1          0          0          0    0    0    0
## ENSMUSG00000051951.5        163        187        169  174  146  166
## ENSMUSG00000102851.1          0          0          0    0    0    0
## ENSMUSG00000103377.1         25         32         12   38   39   38
## ENSMUSG00000104017.1         14         14         11   23   14   19

Remove genes with no reads:

dim(DESeq.ds_db_wt_go)
## [1] 55487     6
keep_genes_db_wt <- rowSums(counts(DESeq.ds_db_wt_go)) > 0
DESeq.ds_db_wt_go <- DESeq.ds_db_wt_go[ keep_genes_db_wt, ]
dim(DESeq.ds_db_wt_go)
## [1] 28940     6

Below we relevel the data so that the reference is the wild-type samples.

DESeq.ds_db_wt_go
## class: DESeqDataSet 
## dim: 28940 6 
## metadata(1): version
## assays(1): counts
## rownames(28940): ENSMUSG00000051951.5 ENSMUSG00000103377.1 ...
##   ENSMUSG00000095742.1 ENSMUSG00000095041.7
## rowData names(0):
## colnames(6): DIABETES_1 DIABETES_2 ... WT_2 WT_3
## colData names(1): condition_db_wt
DESeq.ds_db_wt_go$condition_db_wt
## [1] DIABETES DIABETES DIABETES WT       WT       WT      
## Levels: DIABETES WT
DESeq.ds_db_wt_go$condition_db_wt <- relevel(DESeq.ds_db_wt_go$condition_db_wt, ref = 'WT')

Now, let’s perform DE analysis!

DESeq.ds_db_wt_go <- DESeq(DESeq.ds_db_wt_go)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

Below we look at the DE in a new way. We can obtain unfiltered GLM results, i.e. without outlier removal or independent filtering with the following call which results in no p-values having an NA as a result:

DGE.results_db_wt_go <- results(DESeq.ds_db_wt_go)
dim(DGE.results_db_wt_go)
## [1] 28940     6
# Find how many significant genes. There are a large number of genes that have NA because the program knows that they have no chance at being significant and therefore does not even produce an output 
table(DGE.results_db_wt_go$padj < 0.05, useNA = "ifany")
## 
## FALSE  TRUE  <NA> 
## 13450  5948  9542

Below we shrink the logFC which adjusts for types of noise we might see and tend to be more reliable and more robust.

# BiocManager::install('apeglm')
DGE.results.shrink_db_wt_go <- lfcShrink(DESeq.ds_db_wt_go, coef = 2, type = 'apeglm')
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

Below we find all significant genes.

DGEgenes_db_wt_go <- subset(DGE.results_db_wt_go, padj < 0.05) %>% rownames

Below we run GO analysis using the biological properties subontology (ont = “BP”). We output a dotplot and graph showing the results.

#BiocManager::install("clusterProfiler")
library(enrichplot)
## Warning: package 'enrichplot' was built under R version 4.0.3
## 
library(org.Mm.eg.db)
## 
library(clusterProfiler)
## Warning: package 'clusterProfiler' was built under R version 4.0.3
## clusterProfiler v3.18.1  For help: https://guangchuangyu.github.io/software/clusterProfiler
## 
## If you use clusterProfiler in published research, please cite:
## Guangchuang Yu, Li-Gen Wang, Yanyan Han, Qing-Yu He. clusterProfiler: an R package for comparing biological themes among gene clusters. OMICS: A Journal of Integrative Biology. 2012, 16(5):284-287.
## 
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:biomaRt':
## 
##     select
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## The following object is masked from 'package:IRanges':
## 
##     slice
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:purrr':
## 
##     simplify
## The following object is masked from 'package:stats':
## 
##     filter
# Change rownames so that they are readable for the GO analysis by removing all decimals and numbers after
DGEgenes_db_wt_go <- gsub("\\..*","",DGEgenes_db_wt_go)
gene_ontology_results <- enrichGO(gene = DGEgenes_db_wt_go,
                OrgDb         = org.Mm.eg.db,
                keyType       = 'ENSEMBL',
                ont           = "BP",
                pAdjustMethod = "BH")

dotplot(gene_ontology_results, showCategory=30)
## wrong orderBy parameter; set to default `orderBy = "x"`

plotGOgraph(gene_ontology_results)
## Loading required package: topGO
## Warning: package 'topGO' was built under R version 4.0.3
## Loading required package: graph
## Warning: package 'graph' was built under R version 4.0.3
## 
## Attaching package: 'graph'
## The following object is masked from 'package:circlize':
## 
##     degree
## The following object is masked from 'package:stringr':
## 
##     boundary
## Loading required package: GO.db
## Loading required package: SparseM
## Warning: package 'SparseM' was built under R version 4.0.4
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
## 
## groupGOTerms:    GOBPTerm, GOMFTerm, GOCCTerm environments built.
## 
## Attaching package: 'topGO'
## The following object is masked from 'package:grid':
## 
##     depth
## The following object is masked from 'package:IRanges':
## 
##     members
## 
## groupGOTerms:    GOBPTerm, GOMFTerm, GOCCTerm environments built.
## 
## Building most specific GOs .....
##  ( 12639 GO terms found. )
## 
## Build GO DAG topology ..........
##  ( 12639 GO terms and 29364 relations. )
## 
## Annotating nodes ...............
##  ( 21602 genes annotated to the GO terms. )
## Loading required package: Rgraphviz
## Warning: package 'Rgraphviz' was built under R version 4.0.3
## 
## Attaching package: 'Rgraphviz'
## The following objects are masked from 'package:IRanges':
## 
##     from, to
## The following objects are masked from 'package:S4Vectors':
## 
##     from, to

## $dag
## A graphNEL graph with directed edges
## Number of Nodes = 47 
## Number of Edges = 74 
## 
## $complete.dag
## [1] "A graph with 47 nodes."

Below we run GO analysis using the cellular components subontology (ont = “CC”). We output a dotplot and graph showing the results.

#BiocManager::install("clusterProfiler")
library(enrichplot)
library(org.Mm.eg.db)
library(clusterProfiler)
# Change rownames so that they are readable for the GO analysis by removing all decimals and numbers after
DGEgenes_db_wt_go <- gsub("\\..*","",DGEgenes_db_wt_go)
gene_ontology_results <- enrichGO(gene = DGEgenes_db_wt_go,
                OrgDb         = org.Mm.eg.db,
                keyType       = 'ENSEMBL',
                ont           = "CC",
                pAdjustMethod = "BH")
dotplot(gene_ontology_results, showCategory=30)
## wrong orderBy parameter; set to default `orderBy = "x"`

plotGOgraph(gene_ontology_results)
## 
## groupGOTerms:    GOBPTerm, GOMFTerm, GOCCTerm environments built.
## 
## Building most specific GOs .....
##  ( 1560 GO terms found. )
## 
## Build GO DAG topology ..........
##  ( 1560 GO terms and 2661 relations. )
## 
## Annotating nodes ...............
##  ( 21624 genes annotated to the GO terms. )

## $dag
## A graphNEL graph with directed edges
## Number of Nodes = 30 
## Number of Edges = 43 
## 
## $complete.dag
## [1] "A graph with 30 nodes."

Below we run GO analysis using the molecular function subontology (ont = “MF”). We output a dotplot and graph showing the results.

#BiocManager::install("clusterProfiler")
library(enrichplot)
library(org.Mm.eg.db)
library(clusterProfiler)
# Change rownames so that they are readable for the GO analysis by removing all decimals and numbers after
DGEgenes_db_wt_go <- gsub("\\..*","",DGEgenes_db_wt_go)
gene_ontology_results <- enrichGO(gene = DGEgenes_db_wt_go,
                OrgDb         = org.Mm.eg.db,
                keyType       = 'ENSEMBL',
                ont           = "MF",
                pAdjustMethod = "BH")
dotplot(gene_ontology_results, showCategory=30)
## wrong orderBy parameter; set to default `orderBy = "x"`

plotGOgraph(gene_ontology_results)
## 
## groupGOTerms:    GOBPTerm, GOMFTerm, GOCCTerm environments built.
## 
## Building most specific GOs .....
##  ( 3128 GO terms found. )
## 
## Build GO DAG topology ..........
##  ( 3128 GO terms and 4051 relations. )
## 
## Annotating nodes ...............
##  ( 21006 genes annotated to the GO terms. )

## $dag
## A graphNEL graph with directed edges
## Number of Nodes = 23 
## Number of Edges = 25 
## 
## $complete.dag
## [1] "A graph with 23 nodes."

Analysis Using Featurecounts Matrix From Paper

We first import our libraries and featurecounts. We notice that there is one more sample than we had. The first 9 samples correspond to the actual samples used so we need to cut out the last from the analysis.

library(tidyverse)
library(ggplot2)
library(magrittr)
library(DESeq2)
library(hexbin)
library(org.Sc.sgd.db)
library(EnhancedVolcano)
library(patchwork)
readcounts_paper <- paste0("featurecounts_from_paper.txt") %>% read.table(., header=TRUE)

Below we take the columns corresponding to the diabetic and wild-type mice only for a direct comparison between these two sample types. We call the correct columns in so that the first 3 are wild type and the last 3 are diabetic mice. We also keep the gene names column.

## Take only the columns for diabetes and wild-type
readcounts_paper_db_wt <- readcounts_paper[ , c(1, 5:6, 8, 7, 9:10)]

head(readcounts_paper_db_wt)
##          geneid G037 G119 H067 H064 H069 H075
## 1 0610005C13Rik    3    2    2    0    1    0
## 2 0610006L08Rik    1    0    0    0    0    0
## 3 0610009B22Rik  235  227  215  140  195  230
## 4 0610009E02Rik   26   31   32   30   33   54
## 5 0610009L18Rik   55   37   41   23   18   16
## 6 0610009O20Rik  651  637  529  279  334  393

Below we make the geneid values the rownames and Change Column names.

row.names(readcounts_paper_db_wt) <- make.names(readcounts_paper_db_wt$geneid)
readcounts_paper_db_wt[[1]] <- NULL

colnames(readcounts_paper_db_wt) <- c("WT_1", "WT_2", "WT_3", "DIABETES_1", "DIABETES_2", "DIABETES_3")
str(readcounts_paper_db_wt)
## 'data.frame':    52459 obs. of  6 variables:
##  $ WT_1      : int  3 1 235 26 55 651 684 13 0 105 ...
##  $ WT_2      : int  2 0 227 31 37 637 542 8 0 135 ...
##  $ WT_3      : int  2 0 215 32 41 529 506 8 0 119 ...
##  $ DIABETES_1: int  0 0 140 30 23 279 335 2 0 62 ...
##  $ DIABETES_2: int  1 0 195 33 18 334 441 7 0 103 ...
##  $ DIABETES_3: int  0 0 230 54 16 393 466 10 0 98 ...

Below we create a dataframe for the gene information for each sample.

 #let's use the info from our readcounts objects
sample_info_paper_db_wt <- DataFrame(condition_paper_db_wt =gsub("_[0-9]+", "",names(readcounts_paper_db_wt)),row.names =names(readcounts_paper_db_wt) )

sample_info_paper_db_wt # rows contain gene information for each sample
## DataFrame with 6 rows and 1 column
##            condition_paper_db_wt
##                      <character>
## WT_1                          WT
## WT_2                          WT
## WT_3                          WT
## DIABETES_1              DIABETES
## DIABETES_2              DIABETES
## DIABETES_3              DIABETES

Generate the DESeqDataSet for wild-type and diabetes only

Below we generate the DESeqDataSet from our counts for the wild-type and diabetic mice.

DESeq.ds_paper_db_wt <- DESeqDataSetFromMatrix(countData = as.matrix(readcounts_paper_db_wt), colData = sample_info_paper_db_wt, design = ~ condition_paper_db_wt)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
DESeq.ds_paper_db_wt
## class: DESeqDataSet 
## dim: 52459 6 
## metadata(1): version
## assays(1): counts
## rownames(52459): X0610005C13Rik X0610006L08Rik ... n.TSaga9 n.TStga1
## rowData names(0):
## colnames(6): WT_1 WT_2 ... DIABETES_2 DIABETES_3
## colData names(1): condition_paper_db_wt
head(counts(DESeq.ds_paper_db_wt))
##                WT_1 WT_2 WT_3 DIABETES_1 DIABETES_2 DIABETES_3
## X0610005C13Rik    3    2    2          0          1          0
## X0610006L08Rik    1    0    0          0          0          0
## X0610009B22Rik  235  227  215        140        195        230
## X0610009E02Rik   26   31   32         30         33         54
## X0610009L18Rik   55   37   41         23         18         16
## X0610009O20Rik  651  637  529        279        334        393

Below we show the number of reads sequenced for each sample.

colSums(counts(DESeq.ds_paper_db_wt)) %>% barplot

Remove genes with no reads:

dim(DESeq.ds_paper_db_wt)
## [1] 52459     6
keep_genes_paper_db_wt <- rowSums(counts(DESeq.ds_paper_db_wt)) > 0
DESeq.ds_paper_db_wt <- DESeq.ds_paper_db_wt[ keep_genes_paper_db_wt, ]
dim(DESeq.ds_paper_db_wt)
## [1] 28804     6

Calculating and applying the size factor for diabetic and wild type only

Below we show the size factors of the samples.

DESeq.ds_paper_db_wt <- estimateSizeFactors(DESeq.ds_paper_db_wt)
# calculate SFs, add them to object
plot(sizeFactors(DESeq.ds_paper_db_wt),colSums(counts(DESeq.ds_paper_db_wt)), ylab = "library sizes", xlab = "size factors", cex = .6 )

Reducing the dependence of the variance on the mean

First, we use the rlog function which transforms the read counts to the log2 scale while simultaneously minimizing the difference between samples for rows with small counts and taking differences between library sizes of the samples into account. More specifically, the expression values are modeled in such a manner that their dispersion is not based on the actual variability an individual gene may show across its replicates, instead, its variability is based on the general dispersion-mean trend over the entire dataset.

# this actually generates a different type of object!
DESeq.rlog_paper_db_wt <-rlog(DESeq.ds_paper_db_wt, blind = TRUE)
## set blind = FALSE if the conditions are expected to introduce
## strong differences in a large proportion of the genes

Let’s visually check the results of the rlog transformation for the first diabetes sample and first wild-type sample:

par(mfrow=c(1,2))


## the rlog-transformed counts are stored in the accessor "assay"
plot(assay(DESeq.rlog_paper_db_wt)[,1],assay(DESeq.rlog_paper_db_wt)[,2],cex=.1, main = "rlog transformed",xlab =colnames(assay(DESeq.rlog_paper_db_wt[,1])),ylab =colnames(assay(DESeq.rlog_paper_db_wt[,4])) )

Below we create an assay for the rlog-transform of the readcounts.

rlog.norm.counts_paper_db_wt <-assay(DESeq.rlog_paper_db_wt)

Below we show the effect of the rlog-transformed read counts.

## rlog-transformed read counts
msd_plot_paper_db_wt <- vsn::meanSdPlot( rlog.norm.counts_paper_db_wt, ranks=FALSE, plot = FALSE)
msd_plot_paper_db_wt$gg+ ggtitle("rlog transformation") + coord_cartesian(ylim =c(0,3))

Below we relevel the data so that the reference is the wild-type samples.

DESeq.ds_paper_db_wt
## class: DESeqDataSet 
## dim: 28804 6 
## metadata(1): version
## assays(1): counts
## rownames(28804): X0610005C13Rik X0610006L08Rik ... n.R5s95 n.R5s96
## rowData names(0):
## colnames(6): WT_1 WT_2 ... DIABETES_2 DIABETES_3
## colData names(2): condition_paper_db_wt sizeFactor
DESeq.ds_paper_db_wt$condition_paper_db_wt
## [1] WT       WT       WT       DIABETES DIABETES DIABETES
## Levels: DIABETES WT
DESeq.ds_paper_db_wt$condition_paper_db_wt <- relevel(DESeq.ds_paper_db_wt$condition_paper_db_wt, ref = 'WT')

Now, let’s perform DE analysis!

DESeq.ds_paper_db_wt <- DESeq(DESeq.ds_paper_db_wt)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

Below we look at the DE in a new way. We can obtain unfiltered GLM results, i.e. without outlier removal or independent filtering with the following call which results in no p-values having an NA as a result:

DGE.results_paper_db_wt <- results(DESeq.ds_paper_db_wt)
dim(DGE.results_paper_db_wt)
## [1] 28804     6
# Find how many significant genes. There are a large number of genes that have NA because the program knows that they have no chance at being significant and therefore does not even produce an output 
table(DGE.results_paper_db_wt$padj < 0.05, useNA = "ifany")
## 
## FALSE  TRUE  <NA> 
## 14290  5575  8939

The top 4 most significant genes and their corresponding adjusted p-values are shown below.

row.names(DGE.results_paper_db_wt)[which(DGE.results_paper_db_wt$padj<1e-210)]
## [1] "Aldh1a3"  "Gc"       "Npas4"    "Serpina7"
DGE.results_paper_db_wt$padj[which(DGE.results_paper_db_wt$padj<1e-210)]
## [1] 3.423244e-294 2.412112e-233 1.134707e-233  0.000000e+00

Below we create a heatmap using complexheatmap for all significant genes.

library(ComplexHeatmap)
DGEgenes_paper_db_wt <- subset(DGE.results_paper_db_wt, padj < 0.05) %>% rownames
row.names(DESeq.rlog_paper_db_wt) <- rownames(DGE.results_paper_db_wt)
rlog.dge_paper_db_wt <- DESeq.rlog_paper_db_wt[DGEgenes_paper_db_wt, ] %>% assay
z.mat <- t(scale(t(rlog.dge_paper_db_wt), center=TRUE, scale=TRUE))
hcDat <- hclust(dist(z.mat))
cutGroups <- cutree(hcDat, h=4)
Heatmap(z.mat, name = "z-score",
        show_row_name = FALSE,
        cluster_columns = FALSE,
        split=cutGroups,
        column_title = "Heatmap For DB vs. WT Significant Genes")

MA-plots and volcano plots

The MA-plot provides a global view of the differential genes, with the log2 fold change on the y-axis over the mean of normalized counts. Genes that pass the significance threshold (adjusted p.value <0.05) are colored in blue.

DESeq2::plotMA(DGE.results_paper_db_wt, alpha = 0.05, main = "Test: p.adj.value < 0.05 DB vs. WT", ylim =c(-4,4))
abline(h = 1, col = 'red')
abline(h = -1, col = 'red')

Below we create a volcano plot to show the most significant gene differences in expression between the heterozygous and diabetes mice.

library(EnhancedVolcano)
vp1_paper_db_wt <-EnhancedVolcano(DGE.results_paper_db_wt,lab =rownames(DGE.results_paper_db_wt),x ='log2FoldChange',y ='padj', pCutoff = 0.05, title = "DB vs HET")
## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest non-
## zero p-value...
print(vp1_paper_db_wt)

Shrinking logFC

Adjusts for types of noise we might see and tend to be more reliable and more robust. We also find the upregulated and downregulated genes based on the log2 Fold Change for the genes that we have already determined to be significant by their p-value. We find a total of 1619 upregulated genes and 473 downregulated genes.

# BiocManager::install('apeglm')
DGE.results.shrink_paper_db_wt <- lfcShrink(DESeq.ds_paper_db_wt, coef = 2, type = 'apeglm')
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
upregulated_genes_values_paper_db_wt <- DGE.results.shrink_paper_db_wt$log2FoldChange[DGE.results.shrink_paper_db_wt$log2FoldChange > 1]
upregulated_genes_values_paper_db_wt_sorted <- upregulated_genes_values_paper_db_wt[sort.list(upregulated_genes_values_paper_db_wt)]
top_fifty_upregulated_genes_paper_db_wt <- names(upregulated_genes_values_paper_db_wt_sorted[1:50])

downregulated_genes_values_paper_db_wt <- DGE.results.shrink_paper_db_wt$log2FoldChange[DGE.results.shrink_paper_db_wt$log2FoldChange < -1]
downregulated_genes_values_paper_db_wt_sorted <- downregulated_genes_values_paper_db_wt[sort.list(downregulated_genes_values_paper_db_wt)]
top_fifty_downregulated_genes_paper_db_wt <- names(downregulated_genes_values_paper_db_wt_sorted[1:50])

upregulated_genes_paper_db_wt <- row.names(DGE.results.shrink_paper_db_wt)[DGE.results.shrink_paper_db_wt$log2FoldChange > 1]
downregulated_genes_paper_db_wt <- row.names(DGE.results.shrink_paper_db_wt)[DGE.results.shrink_paper_db_wt$log2FoldChange < -1]

finding_upregulated_genes_in_significant_genes_paper_db_wt <- upregulated_genes_paper_db_wt %in% DGEgenes_paper_db_wt
final_upregulated_genes_paper_db_wt <- upregulated_genes_paper_db_wt[finding_upregulated_genes_in_significant_genes_paper_db_wt]

finding_downregulated_genes_in_significant_genes_paper_db_wt <- downregulated_genes_paper_db_wt %in% DGEgenes_paper_db_wt
final_downregulated_genes_paper_db_wt <- downregulated_genes_paper_db_wt[finding_downregulated_genes_in_significant_genes_paper_db_wt]
print(length(final_upregulated_genes_paper_db_wt))
## [1] 1354
print(length(final_downregulated_genes_paper_db_wt))
## [1] 473

In the paper, the gene Gc is included in the Top 50 genes in terms of increased expression. Below we make a table showing that Gc should not be in the top 50 upregulated genes. This suggests to us that the authors included this gene in their top 50 most important genes because they either wanted to specifically show it because they think it is important, or they put a larger weight on adjusted p-value. This suggests that other genes would also be scored differently than just using the log2 FC value for upregulation and downregulation. Therefore, our Top 50 upregulated and Top 50 downregulated genes should not match with those from the paper.

Rank In Upregulated Genes By log2 FC Gene log2 FC padj
1 X1810009J06Rik 14.39679 3.6617e-14
50 X1810006J02Rik 5.245285 4.94998e-23
140 Gc 3.44248 2.41211e-233

Below we create a heatmap using complexheatmap for the top 50 upregulated genes with respect to log FC value. We see that diabetes sample 2 is more like the wild type samples than the other two diabetes samples. This makes sense because it is what we saw from the PCA as well, as diabetes sample 2 was in between the wild type and diabetes samples, but closer to the other diabetes samples.

DGEgenes_paper_db_wt_top_fifty_upregulated <- top_fifty_upregulated_genes_paper_db_wt
rlog.dge_paper_db_wt_top_fifty_upregulated <- DESeq.rlog_paper_db_wt[DGEgenes_paper_db_wt_top_fifty_upregulated, ] %>% assay
z.mat <- t(scale(t(rlog.dge_paper_db_wt_top_fifty_upregulated), center=TRUE, scale=TRUE))
hcDat <- hclust(dist(z.mat))
cutGroups <- cutree(hcDat, h=4)
Heatmap(z.mat, name = "z-score",
        column_title = "DB vs. WT: Top 50 Upregulated Genes",
        column_title_gp = gpar(fontsize = 10),
        show_row_name = TRUE,
        cluster_columns = FALSE,
        split=cutGroups)

Below we create a heatmap using complexheatmap for the top 50 downregulated genes with respect to log FC value. We see that diabetes sample 2 is more like the wild type samples than the other two diabetes samples. This makes sense because it is what we saw from the PCA as well, as diabetes sample 2 was in between the wild type and diabetes samples, but closer to the other diabetes samples.

DGEgenes_paper_db_wt_top_fifty_downregulated <- top_fifty_downregulated_genes_paper_db_wt
rlog.dge_paper_db_wt_top_fifty_downregulated <- DESeq.rlog_paper_db_wt[DGEgenes_paper_db_wt_top_fifty_downregulated, ] %>% assay
z.mat <- t(scale(t(rlog.dge_paper_db_wt_top_fifty_downregulated), center=TRUE, scale=TRUE))
hcDat <- hclust(dist(z.mat))
cutGroups <- cutree(hcDat, h=4)
Heatmap(z.mat, name = "z-score",
        column_title = "DB vs. WT: Top 50 Downregulated Genes",
        column_title_gp = gpar(fontsize = 10),
        show_row_name = TRUE,
        cluster_columns = FALSE,
        split=cutGroups)

Below we show the logFC values for the top 50 upregulated genes which are also significant by p-value.

all_pval_and_logfc_significant_upregulated_genes <- DGE.results.shrink_paper_db_wt[row.names(DGE.results.shrink_paper_db_wt) %in% final_upregulated_genes_paper_db_wt, ]
sort(all_pval_and_logfc_significant_upregulated_genes$log2FoldChange)[1304:1354]
## X1810006J02Rik          Kcns3         Vstm2a          C1ql3          Pde6a 
##       5.245285       5.393309       5.402010       5.454731       5.478843 
##            Cpz       Atp6v0d2          Bcat1        Slc5a10        Gm34653 
##       5.479167       5.519582       5.526320       5.621662       5.640289 
##       Serpina7           Il10            Il6           Fmod            Mro 
##       5.680781       5.686903       5.755263       5.778463       6.171551 
##        Col10a1          Gpnmb          Klhl1          Dhrs9            Shh 
##       6.184116       6.214617       6.216527       6.224040       6.236200 
##           Gast         Gm5409           Agr2        Gm10177        Gm37301 
##       6.254682       6.321531       6.377081       6.455122       6.479980 
##          Mep1a          Morc1        Cyp17a1         Wnt10a        C1qtnf1 
##       6.486648       6.492698       6.520305       6.555154       6.706034 
##          Reg3g           Gli1         Gm4744          Scn4a           Csn3 
##       6.721743       6.993563       7.044465       7.111826       7.444613 
##            Kel        Gm38306           Ros1      Itih5l.ps        Aldh1a3 
##       7.503326       7.603364       7.616899       7.631445       7.652751 
##         Gm9195          Prss3         Anxa10            Cck        Tm4sf20 
##       8.885589       9.092656       9.185532       9.353554       9.477955 
## X5730412P04Rik        Gm10334         Gm5771          Prss1         Gm2663 
##       9.629238      10.420761      10.549887      10.910048      10.927777 
## X1810009J06Rik 
##      14.396793

Below we show the logFC values for the top 50 downregulated genes which are also significant by p-value.

all_pval_and_logfc_significant_downregulated_genes <- DGE.results.shrink_paper_db_wt[row.names(DGE.results.shrink_paper_db_wt) %in% final_downregulated_genes_paper_db_wt, ]
sort(all_pval_and_logfc_significant_downregulated_genes$log2FoldChange)[1:50]
##     AC159886.1         Kcnj12        Gm44850 X4931431B13Rik         Mcoln3 
##      -6.616388      -5.691289      -5.425301      -5.293121      -5.289771 
##          Trpm5         Lingo3         Cox6a2             T2          Npas4 
##      -5.163788      -5.139483      -4.990010      -4.766953      -4.727331 
##         Gm4791 X4933428C19Rik     AC154352.2         Lrrc38        Fam196a 
##      -4.543815      -4.209290      -4.113718      -4.061423      -4.024698 
##           Edn2          Kcng3           Cdh7 X2410021H03Rik        Cyp4f39 
##      -3.939609      -3.938489      -3.901161      -3.897603      -3.758604 
##           Nnat         Chrna4         Vwa5b1          Myo3a        Fam151a 
##      -3.697980      -3.684943      -3.663756      -3.587900      -3.523127 
##        Gad1.ps          Tdrd6           Gad1           Mafa        Gm45847 
##      -3.441585      -3.307254      -3.157911      -3.151913      -3.067840 
##        Ppp1r1a            Npy         Adarb2        Gm12962         Slc2a2 
##      -2.987624      -2.964349      -2.853704      -2.814216      -2.808916 
##            Arc        Tmem215          Crhr1        Carmil3        Slitrk1 
##      -2.784628      -2.711947      -2.671140      -2.637913      -2.567975 
##         Dlgap1       Tmem229b          Rtbdn        Snord72         Gm8696 
##      -2.563849      -2.539001      -2.522449      -2.466287      -2.459248 
##          Pcsk9         Pcdh10        Slc30a8           Maob        Hspa12a 
##      -2.424653      -2.415488      -2.386814      -2.347396      -2.346101

Next we show the effect of the shrinkage on the MA plot:

DESeq2::plotMA(DGE.results.shrink_paper_db_wt,alpha = 0.05, main = "DB vs WT: with logFC shrinkage", ylim =c(-3,3))
abline(h = 1, col = 'red')
abline(h = -1, col = 'red')

par(mfrow =c(1,2))
DESeq2::plotMA(DGE.results_paper_db_wt, alpha = 0.05, main = "no shrinkage", ylim =c(-4,4))
abline(h = 1, col = 'red')
abline(h = -1, col = 'red')
DGE.results.shrink_paper_db_wt <- lfcShrink(DESeq.ds_paper_db_wt, coef = 2, type = "apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
DESeq2::plotMA(DGE.results.shrink_paper_db_wt,alpha = 0.05, main = "with logFC shrinkage", ylim =c(-4,4))
abline(h = 1, col = 'red')
abline(h = -1, col = 'red')

Below we show a volcano plot after the logFC shrink operation. We also compare the gene expression differences between the diabetic and wild-type mice with and without the logFC shrink. We see that with the shrinkage, we find a few more significant genes.

vp2_paper_db_wt <-EnhancedVolcano(DGE.results.shrink_paper_db_wt, lab =rownames(DGE.results.shrink_paper_db_wt),x ='log2FoldChange',y ='padj', pCutoff = 0.05,title = "DB vs WT: with logFC shrinkage")
## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest non-
## zero p-value...
library(patchwork)
print(vp2_paper_db_wt)

vp1_paper_db_wt + vp2_paper_db_wt